Dot Pattern Variability
  • Prior Work
    • 2022_Paper
    • Hu_Nosofsky24
  • Dot Patterns
  • Similarity Ratings
    • Data Checks
    • Group Comparison
  • Task

On this page

  • Data Inspection & Sanity Checks
    • Prototype set counts
    • Rating Distributions
    • Reaction Time Distributions
    • Individual Subject Ratings
  • Lowest and Highest Rated Pairs
  • Pattern Pair Table
  • Stucture of dataframe after merging similarity ratings with CatLearn accuracy
  • View source
  • Edit this page
  • Report an issue

Dot Pattern Ratings - data checks

  • Show All Code
  • Hide All Code

  • View Source
Published

April 22, 2024

  • catLearn accuracy refers to accuracy on the categorization task in the Hu & Nosofsky 2024 study.
  • prototype sets refer to sets of 3 prototypes that correspond to a single participant in the Hu & Nosofsky 2024 study.
  • prototype pairs refer to the specific pairs of prototypes that are displayed together on a rating trial (3 pairs per set).
Code
pacman::p_load(dplyr,purrr,tidyr,ggplot2, here, patchwork, 
  conflicted, jsonlite,stringr, gt, knitr, kableExtra, 
  lubridate,ggh4x, lmerTest)
walk(c("dplyr", "lmerTest"), conflict_prefer_all, quiet = TRUE)
options(digits=2, scipen=999, dplyr.summarise.inform=FALSE)
walk(c("fun_plot"), ~ source(here::here(paste0("R/", .x, ".R"))))
mc24_proto <- read.csv(here("Stimulii","mc24_prototypes.csv")) |> mutate(set=paste0(sbjCode,"_",condit)) 
sbj_cat <- read.csv(here("data","mc24_sbj_cat.csv"))

dfiles <- list(path=list.files(here::here("data/dotSim_data"),full.names=TRUE))

d <- map_dfr(dfiles$path, ~read.csv(.x))

d <- map_dfr(dfiles$path, ~{read.csv(.x) |> 
    mutate(sfile=tools::file_path_sans_ext(basename(.x)))}) |> 
  select(-trial_index, -internal_node_id,-trial_type) |>
   mutate(set = paste(str_extract(item_label_1, "^\\d+"),
                     str_extract(item_label_1, "[a-z]+"), sep = "_")) |>
  mutate(pair_label = paste0(item_label_1,"_",item_label_2)) |>
  relocate(sbjCode,date,set,pair_label,trial,item_label_1,item_label_2,response,rt)

setCounts <- d |> 
  pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
  group_by(set) |> summarise(n=n_distinct(sbjCode),resp=mean(response),sd=sd(response)) |> arrange(desc(n))

# length(unique(mc_proto$set)) # 304
setCounts2 <- mc24_proto |> group_by(set) |> 
  slice_head(n=1) |> 
  select(id,file,set) |> 
  left_join(setCounts,by="set") |> 
  mutate(n = ifelse(is.na(n), 0, n), .groups="drop") |> 
  arrange(n) |> ungroup()

pairCounts <- d |> 
  group_by(pair_label,set) |> 
  summarise(n=n(),mean_resp=mean(response),sd=sd(response)) |> arrange(desc(n)) |> ungroup()



patternAvg <- d |> 
  pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
  group_by(item,file) |> 
  summarise(n_rating=n(),resp=mean(response),sd=sd(response)) |> 
  arrange(desc(n_rating))

cat_sim <- sbj_cat |> 
  mutate(item=item_label) |> 
  left_join(patternAvg,by=c("file","item"))  |> arrange(desc(n_rating)) |>
  #remove rows where n_rating is NA, or less than 4
  filter(!is.na(n_rating),n_rating>=12) |> 
  mutate(sim_group = ifelse(resp>6.0,"Very Similar",ifelse(resp<3.5,"Very Dissimilar","Medium"))) |> 
  mutate(sim_group=factor(sim_group,levels=c("Very Dissimilar","Medium","Very Similar"))) 



cat_sim_test <- cat_sim |> 
  filter(Phase==2) 


#cor(cat_sim$resp,cat_sim$Corr)

#  m1 <- lmer(Corr ~ resp + (1|sbjCode), data=cat_sim)
#  summary(m1)

#  m1 <- lmer(Corr ~ resp + (1|Pattern.Type) +  (1|sbjCode), data=cat_sim)
#  summary(m1)

#  m1 <- lmer(Corr ~ resp*Pattern.Type*condit +  (1|sbjCode), data=cat_sim)
#  summary(m1)


# m1 <- lmer(Corr ~ sim_group +  (1|sbjCode), data=cat_sim)
# summary(m1)

# m1 <- lmer(Corr ~ sim_group*condit +  (1|sbjCode), data=cat_sim)
# summary(m1)

# m1 <- lmer(Corr ~ sim_group*condit*Pattern.Type +  (1|sbjCode), data=cat_sim)
# summary(m1)

Data Inspection & Sanity Checks

Code
avg_set_rating <- setCounts2 |> summarise("Avg Ratings Per Set" = mean(n)) |> pull(1)

d |> 
  summarize("N Subjects" = n_distinct(sbjCode), "N Prototype Sets" = n_distinct(set)) |> 
  mutate("Avg Ratings Per Set" = avg_set_rating) |>
  kbl()
Table 1: Current counts of unique subjects, and prototype sets
N Subjects N Prototype Sets Avg Ratings Per Set
87 304 29

Prototype set counts

 

Code
setCounts2 |> group_by(n) |> summarise(nc=n()) |> rename("Number of times prototype set has been included in the study"=n, "Number of prototype sets with this count"=nc) |> gt() |> 
  tab_spanner(label = "Prototype Set Counts") |> 
  tab_header(title = "Prototype Set Counts") |> 
  tab_source_note(
    "Note: The number of times a prototype set has been included in the study is equal to the number of participants who rated the set."
  )
Prototype Set Counts
Number of times prototype set has been included in the study Number of prototype sets with this count
17 1
19 4
20 3
21 3
22 8
23 10
24 19
25 19
26 20
27 25
28 23
29 24
30 28
31 25
32 17
33 24
34 15
35 13
36 14
37 2
38 3
39 1
40 2
41 1
Note: The number of times a prototype set has been included in the study is equal to the number of participants who rated the set.
Code
# d |> filter(sbjCode==11) |> select(sbjCode,date,trial,pair_label,set,rt,time_elapsed,time)

# d |> group_by(sbjCode,set) |> 
#   summarize (n=n()) |>
#   gt()

#d |> group_by(sbjCode, item_label_1, item_label_2) |> summarise(n=n())

# (1-.33)^8
# (factorial(8)/(factorial(6)*factorial(8-6))) * (.33^6)*((1-.33)^(8-6))
# (factorial(8)/(factorial(7)*factorial(8-7))) *(.33^6)*((1-.33)^(8-7))
# (factorial(8)/(factorial(8)*factorial(8-8))) *(.33^6)*(1-.33)^(8-8)

# d |> pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
#   group_by(sbjCode, item) |> summarise(n=n())

# patternCounts <- d |> pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
#   group_by(item) |> summarise(n=n(),resp=mean(response),sd=sd(response)) |> arrange(desc(n))



# d |> 
#     pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> select(sbjCode,set,pair_label,item_label,item,response) |>  group_by(set) |>
#     summarize(n=n_distinct(sbjCode)) |> arrange(desc(n)) 




# d |> group_by(sbjCode, file) |> summarise(n=n())
# d |> group_by(sbjCode, set) |> summarise(n=n())

# d |> group_by(sbjCode) |> summarise(n_distinct(file))
# d |> group_by(sbjCode) |> summarise(n_distinct(set))


# sp <- setCounts2 |> 
#   mutate(set=reorder(set,n)) |>
#   ggplot(aes(x=set,y=n)) +
#    geom_col() +
#    theme(legend.title=element_blank(),
#       axis.text.x = element_text(size=5,angle = 90, hjust = 0.5, vjust = 0.5)) +
#     labs(x="Prototype Set", y="Number of Participants to rate set") 

sh <- setCounts2 |> 
  ggplot(aes(x=n)) + geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks=seq(0, max(setCounts2$n), by = 1)) +
  geom_text(stat="count", aes(label=..count..), vjust=-0.5) +
  labs(x="Number of times prototype set has been included in the study", 
  y="Number of prototype sets for each count") 


#sp/sh

sh

Prototype set counts

Prototype set counts

 

Rating Distributions

Code
pgr <- d |> 
  ggplot(aes(x=response))+geom_histogram(binwidth=1) + 
      scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) + labs(title="Aggregate Rating Distribution", x="Rating", y="Count") 

pir <- d |>  ggplot(aes(x=response))+
      geom_histogram(binwidth=1) + 
      facet_wrap(~sbjCode) + 
      scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) + labs(title="Rating Distribution per Sbj.", x="Rating", y="Count") 

pgr/pir

Rating distributions

Rating distributions

Reaction Time Distributions

Code
prtg <- d |> ggplot(aes(x=rt))+
  geom_density() + 
  labs(title="Aggregate Reaction Time Distribution", x="Reaction Time (ms)", y="Density")

prtid <- d |> ggplot(aes(x=rt))+geom_density() + 
  facet_wrap(~sbjCode,scale="free_x") + labs(title="Reaction Time Distribution per Sbj.", x="Reaction Time (ms)", y="Density")

prtg/prtid

Reaction time distributions

Reaction time distributions

Individual Subject Ratings

Code
# d |> summarize(n=n(), n_distinct(sbjCode), n_distinct(file), n_distinct(set), n_distinct(trial), n_distinct(item_label_1), n_distinct(item_label_2))


# d %>%
#   filter(sbjCode == 11) %>%
#   select(sbjCode, date, trial, set, rt, time) %>%
#   mutate(time_parsed = parse_date_time(paste(date, time), orders = c("mdY IMS p", "mdy IMS p"))) %>%
#   group_by(sbjCode, date) %>%
#   summarise(start_time = min(time_parsed), end_time = max(time_parsed)) %>%
#   mutate(endTimeMinusStart = end_time - start_time)

plot_hist_sbj <- function(id) {
  d |> filter(sbjCode==id) |>
    ggplot(aes(x = response)) +
    geom_histogram(binwidth=1,fill = 'dodgerblue4') +
    scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) +
    theme_minimal() +
    theme(axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.text.x=element_text(size=26))  
}


sbj_sum <- d |> group_by(sbjCode) |> 
#filter(sbjCode<5) |>
  mutate(time_parsed = parse_date_time(paste(date, time), orders = c("mdY IMS p", "mdy IMS p"))) |>
  summarize ("Mean Rating"=mean(response),
  "SD Rating"=sd(response), 
  "Mean RT"=mean(rt)/1000, 
  #"Total Time (min)" = max(time_elapsed)/60000,
  "Total Time (min)" = round(difftime(max(time_parsed), min(time_parsed), units = "mins"),1),
  n_prototype_sets = n_distinct(set), 
  "N Trials" = n_distinct(trial)) |> 
  mutate("Response_Distribution"=sbjCode) 

  sbj_sum |> gt() |> 
    text_transform(
    locations = cells_body(columns = 'Response_Distribution'),
    fn = function(column) {
      map(column, plot_hist_sbj) |>
        ggplot_image(height = px(80), aspect_ratio = 3)
    }
    )
sbj_sum |> mutate(total_time=as.numeric(`Total Time (min)`)) |> 
  rename("Subject"=sbjCode, "N_Sets" = n_prototype_sets) |> 
  summarize("Median Completition time (min)"=median(total_time), 
  "Average Completion Time" = mean(total_time), 
  "Min Completion Time (min)" = min(total_time), 
  "Max Completion Time (min)" = max(total_time)) |> kbl()
Table 2: Individual Subject Ratings
sbjCode Mean Rating SD Rating Mean RT Total Time (min) n_prototype_sets N Trials Response_Distribution
2 6.3 2.0 2.22 14.7 101 303
3 4.7 1.6 1.62 11.6 101 303
4 4.5 1.6 3.52 21.1 101 303
5 4.0 2.0 2.02 13.5 101 303
7 5.1 2.8 2.79 17.4 101 303
8 5.0 2.4 2.10 13.9 101 303
9 4.7 2.1 2.81 17.2 101 303
10 4.6 1.7 2.45 15.8 101 303
11 2.8 1.8 5.80 32.4 101 303
12 4.3 2.6 3.76 22.2 101 303
13 5.8 2.4 1.21 9.5 101 303
14 4.5 1.8 3.74 22 101 303
15 5.1 2.3 2.11 14 101 303
16 6.1 2.1 1.62 11.5 101 303
17 5.5 2.7 1.05 8.7 101 303
18 2.5 2.5 2.13 14.2 101 303
19 4.4 2.3 1.91 13 101 303
20 5.7 1.6 2.18 14.4 101 303
21 4.6 2.3 3.93 23.1 101 303
22 4.4 1.6 3.44 20.6 101 303
23 3.3 2.5 2.00 13.5 101 303
24 4.7 2.3 1.51 11.1 101 303
25 5.2 1.6 3.06 18.8 101 303
26 4.7 2.0 1.83 12.6 101 303
27 4.5 2.6 2.69 17 101 303
28 4.7 2.1 6.25 34.9 101 303
29 4.9 2.2 2.75 17.2 101 303
30 5.3 1.9 0.65 6.6 101 303
31 6.6 2.4 1.14 9 101 303
32 4.0 1.8 6.43 35.7 101 303
33 5.1 3.3 1.40 10.3 101 303
34 5.2 2.5 2.77 17.4 101 303
35 4.7 2.2 3.07 18.9 101 303
36 6.3 2.0 1.46 10.7 101 303
37 5.8 1.9 1.42 10.6 101 303
38 5.8 2.4 1.51 10.8 101 303
39 5.7 1.7 2.61 16.2 101 303
40 4.1 2.0 1.97 13.3 101 303
41 5.1 2.4 3.53 21.1 101 303
42 4.8 1.4 3.57 21.4 101 303
43 5.3 2.0 1.82 12.4 101 303
44 3.3 2.6 2.05 13.7 101 303
45 4.6 1.9 3.31 20.1 101 303
46 5.7 1.6 4.92 28 101 303
48 4.7 2.4 3.26 1328 170 303
49 3.7 2.0 3.07 18.6 101 303
50 4.5 1.8 3.05 18.8 101 303
51 4.8 2.7 4.13 24.1 101 303
52 5.2 2.0 3.45 20.7 101 303
53 6.6 1.9 1.74 12.2 101 303
54 4.7 1.7 1.93 13.1 101 303
55 4.7 2.2 3.59 21.4 101 303
56 5.6 1.5 1.89 12.9 101 303
57 4.7 1.7 5.53 31.3 101 303
58 5.4 2.1 1.90 12.9 101 303
59 4.3 1.7 2.40 15.5 101 303
60 6.0 2.5 0.66 6.8 101 303
61 4.9 1.6 2.25 14.3 101 303
62 5.0 2.4 1.73 12 101 303
63 4.7 2.1 3.77 22.3 101 303
64 4.0 2.0 2.31 15.1 101 303
65 4.0 2.0 2.98 18.1 101 303
66 5.0 3.4 3.10 19 101 303
67 4.5 2.5 2.18 14.3 101 303
68 4.1 2.3 2.52 16.1 101 303
69 4.2 2.2 1.90 13 101 303
70 4.3 2.0 2.46 15.8 101 303
71 5.1 1.9 3.49 21 101 303
72 4.7 2.0 2.12 13.8 101 303
73 5.6 2.4 1.39 10.3 101 303
74 5.5 1.9 2.90 17.8 101 303
75 5.1 1.6 1.67 11.8 101 303
76 4.8 1.8 2.44 15.6 101 303
77 3.9 2.3 2.35 15.2 101 303
78 5.0 2.0 2.61 16.5 101 303
79 2.7 2.0 6.03 33.7 101 303
80 5.3 1.6 1.45 10.7 101 303
81 4.8 1.8 1.52 10.9 101 303
82 5.0 3.1 0.54 6.1 101 303
83 4.3 2.3 2.23 14.6 101 303
84 4.8 2.2 1.60 11.5 101 303
85 4.7 3.3 1.68 11.8 101 303
86 5.1 2.3 2.46 15.8 101 303
87 3.1 2.5 2.83 17.7 101 303
88 4.1 2.7 1.24 9.6 101 303
89 4.4 1.3 2.40 15.4 101 303
90 4.9 2.7 2.79 17.4 101 303
Median Completition time (min) Average Completion Time Min Completion Time (min) Max Completion Time (min)
15 31 6.1 1328

Lowest and Highest Rated Pairs

Code
# patternCounts |> filter(n>=8) |>  slice_min(resp)
# patternCounts |> filter(n>=8) |>  slice_max(resp)

# setCounts |> filter(n>=24) |>  slice_min(resp)
# setCounts |> filter(n>=24) |>  slice_max(resp)


# pairCounts |> filter(n>=5) |>  slice_min(resp,n=2)
# pairCounts |> filter(n>=5) |>  slice_max(resp)

min_resp=7
n_show=3

d %>% filter(pair_label %in% {pairCounts |> filter(n>=min_resp) |>  
  slice_min(mean_resp,n=n_show, with_ties=FALSE) |> pull(pair_label)} ) |>
  group_by(pair_label) |>
  slice_head(n=1) %>%
  plot_dotsAll() + 
  plot_annotation(title=glue::glue("Lowest rated pairs ( out of sets with n>={min_resp} ratings)"), theme = theme(plot.title = element_text(hjust = 0.4)))

Code
d %>% filter(pair_label %in% {pairCounts |> filter(n>=min_resp) |>  
  slice_max(mean_resp,n=n_show, with_ties=FALSE) |> pull(pair_label)} ) |>
  group_by(pair_label) |>
  slice_head(n=1) %>%
  plot_dotsAll() +  
  plot_annotation(title=glue::glue("Highest rated pairs ( out of sets with n>={min_resp} ratings)"), theme = theme(plot.title = element_text(hjust = 0.4)))

Pattern Pair Table

All pairs with >=25 ratings

  • click on column headers to change sort order
    • e.g. clicking on “Mean Rating” will toggle showing the pairs rated most similar or most dissimilar
    • clicking on “SD” will toggle showing the pairs with the most or least agreement in ratings
  • note your screen may need to be at full width to see all columns
Code
plot_hist_pair <- function(Pair) {
  d |> filter(pair_label==Pair) |>
    ggplot(aes(x = response)) +
    geom_histogram(binwidth=1,fill = 'dodgerblue4') +
    scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) +
    theme_minimal() +
    theme(axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.text.x=element_text(size=26))  
}



pat_table_plot <- function(Pair){

  df <- d |> filter(pair_label==Pair) |> slice_head(n=1) 

    pat1 <- df %>%
          mutate(pattern_1 = purrr::map(pattern_1, jsonlite::fromJSON)) %>%
          unnest(pattern_1) %>%
          mutate(y=-y, pat=item_label_1) |> select(pair_label,x,y,pat)

    pat2 <- df %>%
          mutate(pattern_2 = purrr::map(pattern_2, jsonlite::fromJSON)) %>%
          unnest(pattern_2) %>%
          mutate(y=-y, pat=item_label_2) |> select(pair_label,x,y,pat)

    pat <- rbind(pat1,pat2)

     pat |> 
    ggplot(aes(x = x, y = y,fill=pat,col=pat)) +
          geom_point(alpha=2) +
          coord_cartesian(xlim = c(-25, 35.5), ylim = c(-25, 25)) +
          theme_minimal() +
          facet_wrap2(~pat,ncol=2,axes="all") + 
          #theme_blank +
          theme_void() + 
          theme(strip.text = element_text(size = 7,hjust=.5),
                panel.spacing.x=unit(-7.3, "lines"), 
                #strip.background = element_rect(colour = "black", linewidth = 2),
                legend.position = "none",
        axis.line.y = element_line(colour = "black", linewidth = .1)) 
}



p5 <- pairCounts |> filter(n>=34) 

p5 |> 
arrange(mean_resp) |>
relocate(pair_label,.after=sd) |>
rename("Pair"=pair_label, "N"=n, "Mean Rating"=mean_resp, "SD"=sd) |>
mutate(Rating_Dist=Pair) |> 
#group_by(Pair) |> 
gt() |> 
tab_options(table.font.size = px(8L)) |>
  cols_width(
    set ~ px(116),
    Pair ~ px(415),
    `N` ~ px(50),
    `Mean Rating` ~ px(90),
    SD ~ px(55)
  ) |>  
  fmt_number(decimals = 1) |> #fmt_integer() |>
  cols_align('left', columns = set) |> 
  text_transform(
    locations = cells_body(columns = Pair),
    fn = function(column) {
      map(column, pat_table_plot) |>
        ggplot_image(height = px(230), aspect_ratio = 1.8)
    }
  ) |>
  text_transform(
    locations = cells_body(columns = Rating_Dist),
    fn = function(column) {
      map(column, plot_hist_pair) |>
        ggplot_image(height = px(150), aspect_ratio = 1)
    }
  ) |>  
   opt_interactive(page_size_default=5, 
     use_page_size_select= TRUE, use_search=TRUE, use_resizers=TRUE,use_filters=TRUE, page_size_values = c(5, 10, 25, 50, 100)) 
Code
# #| fig-cap: dot plots
# #| fig-width: 10
# #| fig-height: 12

# d %>% filter(trial==1) %>%
#   plot_dots()

# d %>% filter(trial==1) %>%
#   plot_dots2()

# d %>% filter(trial<2) %>%
#   plot_dotsAll()
Code
#| fig-width: 6
#| fig-height: 9


# d %>% filter(file %in% unique(d$file[1])) %>%
#   plot_dotsAll()

# plot_dotsAll_orig <- function(df) {
#   plots <- list()

#   for (i in 1:nrow(df)) {
#     p1 <- df[i, ] %>%
#       pivot_longer(cols = starts_with("x"), names_to = "dot", values_to = "x") %>%
#       mutate(dot = as.numeric(str_remove(dot, "x"))) %>%
#       pivot_longer(cols = starts_with("y"), names_to = "dot2", values_to = "y") %>%
#       mutate(dot2 = as.numeric(str_remove(dot2, "y"))) %>%
#       filter(dot == dot2) %>%
#       ggplot(aes(x = x, y = y)) +
#       geom_point() +
#       coord_cartesian(xlim = c(-25, 25), ylim = c(-25, 25)) +
#       theme_minimal() +
#       labs(title = df$id[i]) + theme_blank

#     plots <- append(plots, list(p1))
#   }

#   patchwork::wrap_plots(plots, ncol = 1)
# }

# mc24_proto |> filter(file %in% unique(d$file[1])) %>% plot_dotsAll_orig()

Stucture of dataframe after merging similarity ratings with CatLearn accuracy

  • Each subject in the 2024 study has a similarity score for each of their 3 categories. (averaged over 2 comparisons with that categories prototype)
  • The same category similarity scores are then compared to their accuracy for each of the 5 Pattern Tyeps (old, prototype, new low, new med, new high)
  • 5 Pattern types * 3 Categories = 15 comparisons per subject
Table 3: Example of how data is structure after combining similarity ratings with CatLearn accuracy. 5 Pattern types * 3 Categories = 15 rows per subject in the 2024 study
sbjCode condit Category Pattern.Type n CatLearn Accuracy n_rating Category Similarity sd item
316 high 1 prototype 1 1 60 5.1 2.6 316_high_1_285
316 high 1 old 9 0.4 60 5.1 2.6 316_high_1_285
316 high 1 new_low 3 0.3 60 5.1 2.6 316_high_1_285
316 high 1 new_med 6 0.3 60 5.1 2.6 316_high_1_285
316 high 1 new_high 9 0.4 60 5.1 2.6 316_high_1_285
316 high 2 prototype 1 0 60 3.8 2 316_high_2_327
316 high 2 old 9 0.7 60 3.8 2 316_high_2_327
316 high 2 new_low 3 0.7 60 3.8 2 316_high_2_327
316 high 2 new_med 6 0.2 60 3.8 2 316_high_2_327
316 high 2 new_high 9 0.2 60 3.8 2 316_high_2_327
316 high 3 prototype 1 0 60 5.2 2.4 316_high_3_287
316 high 3 old 9 0.1 60 5.2 2.4 316_high_3_287
316 high 3 new_low 3 0.7 60 5.2 2.4 316_high_3_287
316 high 3 new_med 6 0 60 5.2 2.4 316_high_3_287
316 high 3 new_high 9 0.3 60 5.2 2.4 316_high_3_287
Code
cat_sim_test %>% # round all numerics except sbjCode to 2 decimal places
 mutate(across(where(is.numeric), ~round(., 1))) |> select(-id,-sim_group,-item_label) |> 
  relocate(item,file, .after=sd) |>
  select(-Phase,-Block) |> 
  rename("Category Similarity" = resp, "CatLearn Accuracy" = Corr) |>
   DT::datatable(options = list(pageLength = 6))
# cat_sim_test %>% # round all numerics except sbjCode to 2 decimal places
#      mutate(across(where(is.numeric), ~round(., 1))) |> select(-id,-sim_group,-item_label) |> 
#      filter(sbjCode==316) |> 
#       relocate(item,file, .after=sd) |>
#       select(-Phase,-Block, -file) |> 
#       rename("Category Similarity" = resp, "CatLearn Accuracy" = Corr) |> pander::pandoc.table(style="rmarkdown",split.table=Inf)
Table 4: Example of how data is structure after combining similarity ratings with CatLearn accuracy. 5 Pattern types * 3 Categories = 15 rows per subject in the 2024 study
Source Code
---
title: Dot Pattern Ratings - data checks
date: last-modified
lightbox: true
toc: true
page-layout: full
toc-depth: 3
code-fold: true
code-tools: true
execute: 
  warning: false
  eval: true
---


- catLearn accuracy refers to accuracy on the categorization task in the Hu & Nosofsky 2024 study.
- prototype sets refer to sets of 3 prototypes that correspond to a single participant in the Hu & Nosofsky 2024 study. 
- prototype pairs refer to the specific pairs of prototypes that are displayed together on a rating trial (3 pairs per set). 


```{r}
pacman::p_load(dplyr,purrr,tidyr,ggplot2, here, patchwork, 
  conflicted, jsonlite,stringr, gt, knitr, kableExtra, 
  lubridate,ggh4x, lmerTest)
walk(c("dplyr", "lmerTest"), conflict_prefer_all, quiet = TRUE)
options(digits=2, scipen=999, dplyr.summarise.inform=FALSE)
walk(c("fun_plot"), ~ source(here::here(paste0("R/", .x, ".R"))))
mc24_proto <- read.csv(here("Stimulii","mc24_prototypes.csv")) |> mutate(set=paste0(sbjCode,"_",condit)) 
sbj_cat <- read.csv(here("data","mc24_sbj_cat.csv"))

dfiles <- list(path=list.files(here::here("data/dotSim_data"),full.names=TRUE))

d <- map_dfr(dfiles$path, ~read.csv(.x))

d <- map_dfr(dfiles$path, ~{read.csv(.x) |> 
    mutate(sfile=tools::file_path_sans_ext(basename(.x)))}) |> 
  select(-trial_index, -internal_node_id,-trial_type) |>
   mutate(set = paste(str_extract(item_label_1, "^\\d+"),
                     str_extract(item_label_1, "[a-z]+"), sep = "_")) |>
  mutate(pair_label = paste0(item_label_1,"_",item_label_2)) |>
  relocate(sbjCode,date,set,pair_label,trial,item_label_1,item_label_2,response,rt)

setCounts <- d |> 
  pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
  group_by(set) |> summarise(n=n_distinct(sbjCode),resp=mean(response),sd=sd(response)) |> arrange(desc(n))

# length(unique(mc_proto$set)) # 304
setCounts2 <- mc24_proto |> group_by(set) |> 
  slice_head(n=1) |> 
  select(id,file,set) |> 
  left_join(setCounts,by="set") |> 
  mutate(n = ifelse(is.na(n), 0, n), .groups="drop") |> 
  arrange(n) |> ungroup()

pairCounts <- d |> 
  group_by(pair_label,set) |> 
  summarise(n=n(),mean_resp=mean(response),sd=sd(response)) |> arrange(desc(n)) |> ungroup()



patternAvg <- d |> 
  pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
  group_by(item,file) |> 
  summarise(n_rating=n(),resp=mean(response),sd=sd(response)) |> 
  arrange(desc(n_rating))

cat_sim <- sbj_cat |> 
  mutate(item=item_label) |> 
  left_join(patternAvg,by=c("file","item"))  |> arrange(desc(n_rating)) |>
  #remove rows where n_rating is NA, or less than 4
  filter(!is.na(n_rating),n_rating>=12) |> 
  mutate(sim_group = ifelse(resp>6.0,"Very Similar",ifelse(resp<3.5,"Very Dissimilar","Medium"))) |> 
  mutate(sim_group=factor(sim_group,levels=c("Very Dissimilar","Medium","Very Similar"))) 



cat_sim_test <- cat_sim |> 
  filter(Phase==2) 


#cor(cat_sim$resp,cat_sim$Corr)

#  m1 <- lmer(Corr ~ resp + (1|sbjCode), data=cat_sim)
#  summary(m1)

#  m1 <- lmer(Corr ~ resp + (1|Pattern.Type) +  (1|sbjCode), data=cat_sim)
#  summary(m1)

#  m1 <- lmer(Corr ~ resp*Pattern.Type*condit +  (1|sbjCode), data=cat_sim)
#  summary(m1)


# m1 <- lmer(Corr ~ sim_group +  (1|sbjCode), data=cat_sim)
# summary(m1)

# m1 <- lmer(Corr ~ sim_group*condit +  (1|sbjCode), data=cat_sim)
# summary(m1)

# m1 <- lmer(Corr ~ sim_group*condit*Pattern.Type +  (1|sbjCode), data=cat_sim)
# summary(m1)





```





## Data Inspection & Sanity Checks

```{r}
#| label: tbl-totals
#| tbl-cap: "Current counts of unique subjects, and prototype sets"

avg_set_rating <- setCounts2 |> summarise("Avg Ratings Per Set" = mean(n)) |> pull(1)

d |> 
  summarize("N Subjects" = n_distinct(sbjCode), "N Prototype Sets" = n_distinct(set)) |> 
  mutate("Avg Ratings Per Set" = avg_set_rating) |>
  kbl()

```

### Prototype set counts

\ \

```{r}
setCounts2 |> group_by(n) |> summarise(nc=n()) |> rename("Number of times prototype set has been included in the study"=n, "Number of prototype sets with this count"=nc) |> gt() |> 
  tab_spanner(label = "Prototype Set Counts") |> 
  tab_header(title = "Prototype Set Counts") |> 
  tab_source_note(
    "Note: The number of times a prototype set has been included in the study is equal to the number of participants who rated the set."
  )
```


```{r}
#| fig-cap: Prototype set counts
#| fig-width: 12
#| fig-height: 9



# d |> filter(sbjCode==11) |> select(sbjCode,date,trial,pair_label,set,rt,time_elapsed,time)

# d |> group_by(sbjCode,set) |> 
#   summarize (n=n()) |>
#   gt()

#d |> group_by(sbjCode, item_label_1, item_label_2) |> summarise(n=n())

# (1-.33)^8
# (factorial(8)/(factorial(6)*factorial(8-6))) * (.33^6)*((1-.33)^(8-6))
# (factorial(8)/(factorial(7)*factorial(8-7))) *(.33^6)*((1-.33)^(8-7))
# (factorial(8)/(factorial(8)*factorial(8-8))) *(.33^6)*(1-.33)^(8-8)

# d |> pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
#   group_by(sbjCode, item) |> summarise(n=n())

# patternCounts <- d |> pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
#   group_by(item) |> summarise(n=n(),resp=mean(response),sd=sd(response)) |> arrange(desc(n))



# d |> 
#     pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> select(sbjCode,set,pair_label,item_label,item,response) |>  group_by(set) |>
#     summarize(n=n_distinct(sbjCode)) |> arrange(desc(n)) 




# d |> group_by(sbjCode, file) |> summarise(n=n())
# d |> group_by(sbjCode, set) |> summarise(n=n())

# d |> group_by(sbjCode) |> summarise(n_distinct(file))
# d |> group_by(sbjCode) |> summarise(n_distinct(set))


# sp <- setCounts2 |> 
#   mutate(set=reorder(set,n)) |>
#   ggplot(aes(x=set,y=n)) +
#    geom_col() +
#    theme(legend.title=element_blank(),
#       axis.text.x = element_text(size=5,angle = 90, hjust = 0.5, vjust = 0.5)) +
#     labs(x="Prototype Set", y="Number of Participants to rate set") 

sh <- setCounts2 |> 
  ggplot(aes(x=n)) + geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks=seq(0, max(setCounts2$n), by = 1)) +
  geom_text(stat="count", aes(label=..count..), vjust=-0.5) +
  labs(x="Number of times prototype set has been included in the study", 
  y="Number of prototype sets for each count") 


#sp/sh

sh

```



\ \

### Rating Distributions


```{r}
#| fig-cap: Rating distributions
#| fig-width: 12
#| fig-height: 14

pgr <- d |> 
  ggplot(aes(x=response))+geom_histogram(binwidth=1) + 
      scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) + labs(title="Aggregate Rating Distribution", x="Rating", y="Count") 

pir <- d |>  ggplot(aes(x=response))+
      geom_histogram(binwidth=1) + 
      facet_wrap(~sbjCode) + 
      scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) + labs(title="Rating Distribution per Sbj.", x="Rating", y="Count") 

pgr/pir

```


### Reaction Time Distributions

```{r}
#| fig-cap: Reaction time distributions
#| fig-width: 12
#| fig-height: 14


prtg <- d |> ggplot(aes(x=rt))+
  geom_density() + 
  labs(title="Aggregate Reaction Time Distribution", x="Reaction Time (ms)", y="Density")

prtid <- d |> ggplot(aes(x=rt))+geom_density() + 
  facet_wrap(~sbjCode,scale="free_x") + labs(title="Reaction Time Distribution per Sbj.", x="Reaction Time (ms)", y="Density")

prtg/prtid

```


### Individual Subject Ratings

```{r}
#| label: tbl-indv-sbj
#| tbl-cap: "Individual Subject Ratings "


# d |> summarize(n=n(), n_distinct(sbjCode), n_distinct(file), n_distinct(set), n_distinct(trial), n_distinct(item_label_1), n_distinct(item_label_2))


# d %>%
#   filter(sbjCode == 11) %>%
#   select(sbjCode, date, trial, set, rt, time) %>%
#   mutate(time_parsed = parse_date_time(paste(date, time), orders = c("mdY IMS p", "mdy IMS p"))) %>%
#   group_by(sbjCode, date) %>%
#   summarise(start_time = min(time_parsed), end_time = max(time_parsed)) %>%
#   mutate(endTimeMinusStart = end_time - start_time)

plot_hist_sbj <- function(id) {
  d |> filter(sbjCode==id) |>
    ggplot(aes(x = response)) +
    geom_histogram(binwidth=1,fill = 'dodgerblue4') +
    scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) +
    theme_minimal() +
    theme(axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.text.x=element_text(size=26))  
}


sbj_sum <- d |> group_by(sbjCode) |> 
#filter(sbjCode<5) |>
  mutate(time_parsed = parse_date_time(paste(date, time), orders = c("mdY IMS p", "mdy IMS p"))) |>
  summarize ("Mean Rating"=mean(response),
  "SD Rating"=sd(response), 
  "Mean RT"=mean(rt)/1000, 
  #"Total Time (min)" = max(time_elapsed)/60000,
  "Total Time (min)" = round(difftime(max(time_parsed), min(time_parsed), units = "mins"),1),
  n_prototype_sets = n_distinct(set), 
  "N Trials" = n_distinct(trial)) |> 
  mutate("Response_Distribution"=sbjCode) 

  sbj_sum |> gt() |> 
    text_transform(
    locations = cells_body(columns = 'Response_Distribution'),
    fn = function(column) {
      map(column, plot_hist_sbj) |>
        ggplot_image(height = px(80), aspect_ratio = 3)
    }
    )
  

sbj_sum |> mutate(total_time=as.numeric(`Total Time (min)`)) |> 
  rename("Subject"=sbjCode, "N_Sets" = n_prototype_sets) |> 
  summarize("Median Completition time (min)"=median(total_time), 
  "Average Completion Time" = mean(total_time), 
  "Min Completion Time (min)" = min(total_time), 
  "Max Completion Time (min)" = max(total_time)) |> kbl()

```




## Lowest and Highest Rated Pairs

```{r}
#| fig-width: 11
#| fig-height: 9

# patternCounts |> filter(n>=8) |>  slice_min(resp)
# patternCounts |> filter(n>=8) |>  slice_max(resp)

# setCounts |> filter(n>=24) |>  slice_min(resp)
# setCounts |> filter(n>=24) |>  slice_max(resp)


# pairCounts |> filter(n>=5) |>  slice_min(resp,n=2)
# pairCounts |> filter(n>=5) |>  slice_max(resp)

min_resp=7
n_show=3

d %>% filter(pair_label %in% {pairCounts |> filter(n>=min_resp) |>  
  slice_min(mean_resp,n=n_show, with_ties=FALSE) |> pull(pair_label)} ) |>
  group_by(pair_label) |>
  slice_head(n=1) %>%
  plot_dotsAll() + 
  plot_annotation(title=glue::glue("Lowest rated pairs ( out of sets with n>={min_resp} ratings)"), theme = theme(plot.title = element_text(hjust = 0.4)))


d %>% filter(pair_label %in% {pairCounts |> filter(n>=min_resp) |>  
  slice_max(mean_resp,n=n_show, with_ties=FALSE) |> pull(pair_label)} ) |>
  group_by(pair_label) |>
  slice_head(n=1) %>%
  plot_dotsAll() +  
  plot_annotation(title=glue::glue("Highest rated pairs ( out of sets with n>={min_resp} ratings)"), theme = theme(plot.title = element_text(hjust = 0.4)))


```


## Pattern Pair Table
**All pairs with >=25 ratings**

- click on column headers to change sort order
  - e.g. clicking on "Mean Rating" will toggle showing the pairs rated most similar or most dissimilar
  - clicking on "SD" will toggle showing the pairs with the most or least agreement in ratings
- **note your screen may need to be at full width to see all columns**

::: column-page-right

```{r}
plot_hist_pair <- function(Pair) {
  d |> filter(pair_label==Pair) |>
    ggplot(aes(x = response)) +
    geom_histogram(binwidth=1,fill = 'dodgerblue4') +
    scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) +
    theme_minimal() +
    theme(axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.text.x=element_text(size=26))  
}



pat_table_plot <- function(Pair){

  df <- d |> filter(pair_label==Pair) |> slice_head(n=1) 

    pat1 <- df %>%
          mutate(pattern_1 = purrr::map(pattern_1, jsonlite::fromJSON)) %>%
          unnest(pattern_1) %>%
          mutate(y=-y, pat=item_label_1) |> select(pair_label,x,y,pat)

    pat2 <- df %>%
          mutate(pattern_2 = purrr::map(pattern_2, jsonlite::fromJSON)) %>%
          unnest(pattern_2) %>%
          mutate(y=-y, pat=item_label_2) |> select(pair_label,x,y,pat)


    pat <- rbind(pat1,pat2)

     pat |> 
    ggplot(aes(x = x, y = y,fill=pat,col=pat)) +
          geom_point(alpha=2) +
          coord_cartesian(xlim = c(-25, 35.5), ylim = c(-25, 25)) +
          theme_minimal() +
          facet_wrap2(~pat,ncol=2,axes="all") + 
          #theme_blank +
          theme_void() + 
          theme(strip.text = element_text(size = 7,hjust=.5),
                panel.spacing.x=unit(-7.3, "lines"), 
                #strip.background = element_rect(colour = "black", linewidth = 2),
                legend.position = "none",
        axis.line.y = element_line(colour = "black", linewidth = .1)) 
}



p5 <- pairCounts |> filter(n>=34) 

p5 |> 
arrange(mean_resp) |>
relocate(pair_label,.after=sd) |>
rename("Pair"=pair_label, "N"=n, "Mean Rating"=mean_resp, "SD"=sd) |>
mutate(Rating_Dist=Pair) |> 
#group_by(Pair) |> 
gt() |> 
tab_options(table.font.size = px(8L)) |>
  cols_width(
    set ~ px(116),
    Pair ~ px(415),
    `N` ~ px(50),
    `Mean Rating` ~ px(90),
    SD ~ px(55)
  ) |>  
  fmt_number(decimals = 1) |> #fmt_integer() |>
  cols_align('left', columns = set) |> 
  text_transform(
    locations = cells_body(columns = Pair),
    fn = function(column) {
      map(column, pat_table_plot) |>
        ggplot_image(height = px(230), aspect_ratio = 1.8)
    }
  ) |>
  text_transform(
    locations = cells_body(columns = Rating_Dist),
    fn = function(column) {
      map(column, plot_hist_pair) |>
        ggplot_image(height = px(150), aspect_ratio = 1)
    }
  ) |>  
   opt_interactive(page_size_default=5, 
     use_page_size_select= TRUE, use_search=TRUE, use_resizers=TRUE,use_filters=TRUE, page_size_values = c(5, 10, 25, 50, 100)) 
    

```

:::










<!-- ### Plot Pairs -->
```{r}
# #| fig-cap: dot plots
# #| fig-width: 10
# #| fig-height: 12

# d %>% filter(trial==1) %>%
#   plot_dots()

# d %>% filter(trial==1) %>%
#   plot_dots2()

# d %>% filter(trial<2) %>%
#   plot_dotsAll()

```



<!-- ### Compare to original prototypes -->
```{r}
#| fig-width: 6
#| fig-height: 9


# d %>% filter(file %in% unique(d$file[1])) %>%
#   plot_dotsAll()

# plot_dotsAll_orig <- function(df) {
#   plots <- list()

#   for (i in 1:nrow(df)) {
#     p1 <- df[i, ] %>%
#       pivot_longer(cols = starts_with("x"), names_to = "dot", values_to = "x") %>%
#       mutate(dot = as.numeric(str_remove(dot, "x"))) %>%
#       pivot_longer(cols = starts_with("y"), names_to = "dot2", values_to = "y") %>%
#       mutate(dot2 = as.numeric(str_remove(dot2, "y"))) %>%
#       filter(dot == dot2) %>%
#       ggplot(aes(x = x, y = y)) +
#       geom_point() +
#       coord_cartesian(xlim = c(-25, 25), ylim = c(-25, 25)) +
#       theme_minimal() +
#       labs(title = df$id[i]) + theme_blank

#     plots <- append(plots, list(p1))
#   }

#   patchwork::wrap_plots(plots, ncol = 1)
# }

# mc24_proto |> filter(file %in% unique(d$file[1])) %>% plot_dotsAll_orig()


```







## Stucture of dataframe after merging similarity ratings with CatLearn accuracy

- Each subject in the 2024 study has a similarity score for each of their 3 categories. (averaged over 2 comparisons with that categories prototype)
- The same category similarity scores are then compared to their accuracy for each of the 5 Pattern Tyeps (old, prototype, new low, new med, new high)
- 5 Pattern types * 3 Categories = 15 comparisons per subject

::: {#tbl-ee-brms1}
| sbjCode | condit | Category | Pattern.Type | n | CatLearn Accuracy | n_rating | Category Similarity | sd  |      item      |
|:-------:|:------:|:--------:|:------------:|:-:|:-----------------:|:--------:|:-------------------:|:---:|:--------------:|
|   316   |  high  |    1     |  prototype   | 1 |         1         |    60    |         5.1         | 2.6 | 316_high_1_285 |
|   316   |  high  |    1     |     old      | 9 |        0.4        |    60    |         5.1         | 2.6 | 316_high_1_285 |
|   316   |  high  |    1     |   new_low    | 3 |        0.3        |    60    |         5.1         | 2.6 | 316_high_1_285 |
|   316   |  high  |    1     |   new_med    | 6 |        0.3        |    60    |         5.1         | 2.6 | 316_high_1_285 |
|   316   |  high  |    1     |   new_high   | 9 |        0.4        |    60    |         5.1         | 2.6 | 316_high_1_285 |
|   316   |  high  |    2     |  prototype   | 1 |         0         |    60    |         3.8         |  2  | 316_high_2_327 |
|   316   |  high  |    2     |     old      | 9 |        0.7        |    60    |         3.8         |  2  | 316_high_2_327 |
|   316   |  high  |    2     |   new_low    | 3 |        0.7        |    60    |         3.8         |  2  | 316_high_2_327 |
|   316   |  high  |    2     |   new_med    | 6 |        0.2        |    60    |         3.8         |  2  | 316_high_2_327 |
|   316   |  high  |    2     |   new_high   | 9 |        0.2        |    60    |         3.8         |  2  | 316_high_2_327 |
|   316   |  high  |    3     |  prototype   | 1 |         0         |    60    |         5.2         | 2.4 | 316_high_3_287 |
|   316   |  high  |    3     |     old      | 9 |        0.1        |    60    |         5.2         | 2.4 | 316_high_3_287 |
|   316   |  high  |    3     |   new_low    | 3 |        0.7        |    60    |         5.2         | 2.4 | 316_high_3_287 |
|   316   |  high  |    3     |   new_med    | 6 |         0         |    60    |         5.2         | 2.4 | 316_high_3_287 |
|   316   |  high  |    3     |   new_high   | 9 |        0.3        |    60    |         5.2         | 2.4 | 316_high_3_287 |

Example of how data is structure after combining similarity ratings with CatLearn accuracy. 5 Pattern types * 3 Categories = 15 rows per subject in the 2024 study
{.striped .hover .sm}
::: 



```{r}
#| label: tbl-htw-modelError-e1
#| tbl-cap: "Example of how data is structure after combining similarity ratings with CatLearn accuracy. 5 Pattern types * 3 Categories = 15 rows per subject in the 2024 study"
#| eval: true

cat_sim_test %>% # round all numerics except sbjCode to 2 decimal places
 mutate(across(where(is.numeric), ~round(., 1))) |> select(-id,-sim_group,-item_label) |> 
  relocate(item,file, .after=sd) |>
  select(-Phase,-Block) |> 
  rename("Category Similarity" = resp, "CatLearn Accuracy" = Corr) |>
   DT::datatable(options = list(pageLength = 6))

# cat_sim_test %>% # round all numerics except sbjCode to 2 decimal places
#      mutate(across(where(is.numeric), ~round(., 1))) |> select(-id,-sim_group,-item_label) |> 
#      filter(sbjCode==316) |> 
#       relocate(item,file, .after=sd) |>
#       select(-Phase,-Block, -file) |> 
#       rename("Category Similarity" = resp, "CatLearn Accuracy" = Corr) |> pander::pandoc.table(style="rmarkdown",split.table=Inf)
```

© 2024 Thomas Gorman

site created with R and quarto

  • View source
  • Edit this page
  • Report an issue